{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dressmaker - Medium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.types._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.expressions.Window\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@7fed1f9a"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.types._\n",
    "import org.apache.spark.sql.functions._\n",
    "import org.apache.spark.sql.expressions.Window\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app17-2\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36mspark.sqlContext.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@2361e992"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "import spark.sqlContext.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mjmcust\u001b[39m: \u001b[32mDataFrame\u001b[39m = [c_no: int, c_name: string ... 2 more fields]\n",
       "\u001b[36mdressmaker\u001b[39m: \u001b[32mDataFrame\u001b[39m = [d_no: int, d_name: string ... 2 more fields]\n",
       "\u001b[36mdress_order\u001b[39m: \u001b[32mDataFrame\u001b[39m = [order_no: int, cust_no: int ... 2 more fields]\n",
       "\u001b[36mconstruction\u001b[39m: \u001b[32mDataFrame\u001b[39m = [maker: int, order_ref: int ... 3 more fields]\n",
       "\u001b[36mquantities\u001b[39m: \u001b[32mDataFrame\u001b[39m = [style_q: int, size_q: int ... 1 more field]\n",
       "\u001b[36morder_line\u001b[39m: \u001b[32mDataFrame\u001b[39m = [order_ref: int, line_no: int ... 3 more fields]\n",
       "\u001b[36mgarment\u001b[39m: \u001b[32mDataFrame\u001b[39m = [style_no: int, description: string ... 2 more fields]\n",
       "\u001b[36mmaterial\u001b[39m: \u001b[32mDataFrame\u001b[39m = [material_no: int, fabric: string ... 3 more fields]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val jmcust = hiveCxt.table(\"sqlzoo.jmcust\")\n",
    "val dressmaker = hiveCxt.table(\"sqlzoo.dressmaker\")\n",
    "val dress_order = hiveCxt.table(\"sqlzoo.dress_order\")\n",
    "val construction = hiveCxt.table(\"sqlzoo.construction\")\n",
    "val quantities = hiveCxt.table(\"sqlzoo.quantities\")\n",
    "val order_line = hiveCxt.table(\"sqlzoo.order_line\")\n",
    "val garment = hiveCxt.table(\"sqlzoo.garment\")\n",
    "val material = hiveCxt.table(\"sqlzoo.material\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.\n",
    "Assuming that any garment could be made in any of the available materials, list the garments (description, fabric, colour and pattern) which are expensive to make, that is, those for which the labour costs are 80% or more of the total cost."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>description</th><th>fabric</th><th>colour</th><th>pattern</th>\n",
       "                </tr>\n",
       "                <tr><td>Trousers            </td><td>Cotton              </td><td>Yellow Stripe       </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Yellow Stripe       </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Yellow Stripe       </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Yellow Stripe       </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Green Stripe        </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Green Stripe        </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Green Stripe        </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Green Stripe        </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Black Dotted        </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Black Dotted        </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Black Dotted        </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Black Dotted        </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Red Stripe          </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Red Stripe          </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Red Stripe          </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Red Stripe          </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Polyester           </td><td>Pale Yellow         </td><td>Printed             </td></tr><tr><td>Long Skirt          </td><td>Polyester           </td><td>Pale Yellow         </td><td>Printed             </td></tr><tr><td>Short Skirt         </td><td>Polyester           </td><td>Pale Yellow         </td><td>Printed             </td></tr><tr><td>Sundress            </td><td>Polyester           </td><td>Pale Yellow         </td><td>Printed             </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Blue Stripe         </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Blue Stripe         </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Blue Stripe         </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Blue Stripe         </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Pink Check          </td><td>Woven               </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Pink Check          </td><td>Woven               </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Pink Check          </td><td>Woven               </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Pink Check          </td><td>Woven               </td></tr><tr><td>Trousers            </td><td>Rayon               </td><td>Red/Orange          </td><td>Printed             </td></tr><tr><td>Sundress            </td><td>Rayon               </td><td>Red/Orange          </td><td>Printed             </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Blue Abstract       </td><td>Printed             </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Blue Abstract       </td><td>Printed             </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Blue Abstract       </td><td>Printed             </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Blue Abstract       </td><td>Printed             </td></tr><tr><td>Trousers            </td><td>Cotton              </td><td>Green Abstract      </td><td>Printed             </td></tr><tr><td>Long Skirt          </td><td>Cotton              </td><td>Green Abstract      </td><td>Printed             </td></tr><tr><td>Short Skirt         </td><td>Cotton              </td><td>Green Abstract      </td><td>Printed             </td></tr><tr><td>Sundress            </td><td>Cotton              </td><td>Green Abstract      </td><td>Printed             </td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(garment.join(material)\n",
    " .filter(col(\"labour_cost\")/(col(\"labour_cost\")+col(\"cost\"))>0.8) \n",
    " .select(\"description\", \"fabric\", \"colour\", \"pattern\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.\n",
    "List the descriptions and the number of orders of the less popular garments, that is those for which less than the average number of orders per garment have been placed. Also print out the average number of orders per garment. When calculating the average, ignore any garments for which no orders have been made."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>description</th><th>n_orders</th><th>avg_orders</th>\n",
       "                </tr>\n",
       "                <tr><td>Sundress            </td><td>5</td><td>5.166666666666667</td></tr><tr><td>Short Skirt         </td><td>5</td><td>5.166666666666667</td></tr><tr><td>Shorts              </td><td>5</td><td>5.166666666666667</td></tr><tr><td>Suntop              </td><td>4</td><td>5.166666666666667</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(garment.join(order_line, (garment(\"style_no\") === order_line(\"ol_style\")))\n",
    " .groupBy(\"style_no\", \"description\")\n",
    " .agg(count(\"order_ref\").alias(\"n_orders\"))\n",
    " .withColumn(\"avg_orders\", avg(col(\"n_orders\")).over(Window.partitionBy(lit(0))))\n",
    " .filter(col(\"n_orders\") < col(\"avg_orders\"))\n",
    " .select(\"description\", \"n_orders\", \"avg_orders\")\n",
    " .showHTML()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.\n",
    "Which is the most popular line, that is, the garment with the highest number of orders. Bearing in mind the fact that there may be several such garments, list the garment description(s) and number(s) of orders."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>description</th><th>n_orders</th>\n",
       "                </tr>\n",
       "                <tr><td>Trousers            </td><td>6</td></tr><tr><td>Long Skirt          </td><td>6</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(garment.join(order_line, (garment(\"style_no\") === order_line(\"ol_style\")))\n",
    " .groupBy(\"style_no\", \"description\")\n",
    " .agg(count(\"order_ref\").alias(\"n_orders\"))\n",
    " .withColumn(\"max_orders\", max(\"n_orders\").over(Window.partitionBy(lit(0))))\n",
    " .filter(col(\"n_orders\") === col(\"max_orders\"))\n",
    " .select(\"description\", \"n_orders\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.\n",
    "List the descriptions, and costs of the more expensive size 8, Cotton garments which might be ordered, that is those costing more than the average (labour costs + material costs) to make."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>description</th><th>material_no</th><th>tot_cost</th>\n",
       "                </tr>\n",
       "                <tr><td>Sundress            </td><td>14</td><td>31.2</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(garment.join(order_line, (garment(\"style_no\") === order_line(\"ol_style\")))\n",
    " .join(material, (order_line(\"ol_material\") === material(\"material_no\")))\n",
    " .join(quantities, ((order_line(\"ol_size\") === quantities(\"size_q\")) &&\n",
    "                    (order_line(\"ol_style\") === quantities(\"style_q\"))))\n",
    " .withColumn(\"tot_cost\", round(col(\"labour_cost\") + col(\"quantity\") * col(\"cost\"), 2))\n",
    " .withColumn(\"avg_cost\", avg(col(\"tot_cost\")).over(Window.partitionBy(lit(0))))\n",
    " .filter((col(\"ol_size\") === 8) && (lower(trim(col(\"fabric\"))) === \"cotton\") &&\n",
    "         (col(\"tot_cost\") > col(\"avg_cost\")))\n",
    " .select(\"description\", \"material_no\", \"tot_cost\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.\n",
    "What is the most common size ordered for each garment type? List description, size and number of orders, assuming that there could be several equally popular sizes for each type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>style_no</th><th>description</th><th>ol_size</th><th>max(n_orders)</th>\n",
       "                </tr>\n",
       "                <tr><td>1</td><td>Trousers            </td><td>8</td><td>3</td></tr><tr><td>1</td><td>Trousers            </td><td>14</td><td>1</td></tr><tr><td>1</td><td>Trousers            </td><td>16</td><td>1</td></tr><tr><td>1</td><td>Trousers            </td><td>18</td><td>1</td></tr><tr><td>2</td><td>Long Skirt          </td><td>8</td><td>3</td></tr><tr><td>2</td><td>Long Skirt          </td><td>14</td><td>1</td></tr><tr><td>2</td><td>Long Skirt          </td><td>16</td><td>1</td></tr><tr><td>2</td><td>Long Skirt          </td><td>18</td><td>1</td></tr><tr><td>3</td><td>Shorts              </td><td>8</td><td>1</td></tr><tr><td>3</td><td>Shorts              </td><td>10</td><td>1</td></tr><tr><td>3</td><td>Shorts              </td><td>14</td><td>1</td></tr><tr><td>3</td><td>Shorts              </td><td>16</td><td>1</td></tr><tr><td>3</td><td>Shorts              </td><td>18</td><td>1</td></tr><tr><td>4</td><td>Short Skirt         </td><td>8</td><td>1</td></tr><tr><td>4</td><td>Short Skirt         </td><td>10</td><td>2</td></tr><tr><td>4</td><td>Short Skirt         </td><td>12</td><td>1</td></tr><tr><td>4</td><td>Short Skirt         </td><td>16</td><td>1</td></tr><tr><td>5</td><td>Sundress            </td><td>8</td><td>1</td></tr><tr><td>5</td><td>Sundress            </td><td>10</td><td>2</td></tr><tr><td>5</td><td>Sundress            </td><td>12</td><td>1</td></tr><tr><td>5</td><td>Sundress            </td><td>16</td><td>1</td></tr><tr><td>6</td><td>Suntop              </td><td>8</td><td>1</td></tr><tr><td>6</td><td>Suntop              </td><td>10</td><td>1</td></tr><tr><td>6</td><td>Suntop              </td><td>12</td><td>2</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(garment.join(order_line, (garment(\"style_no\") === order_line(\"ol_style\")))\n",
    " .groupBy(\"style_no\", \"description\", \"ol_size\")\n",
    " .agg(count(\"order_ref\").alias(\"n_orders\"))\n",
    " .groupBy(\"style_no\", \"description\", \"ol_size\")\n",
    " .max(\"n_orders\")\n",
    " .orderBy(\"style_no\", \"description\", \"ol_size\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
